{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from requests_html import HTMLSession\n",
    "import requests_html\n",
    "import pandas as pd\n",
    "import urllib.parse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A1  nfu.edu.cn \n",
    "session = HTMLSession()\n",
    "r = session.get(\"https://www.nfu.edu.cn/ztb/index.htm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 存\n",
    "with open (\"html_out/_nfu_招投标.html\", encoding = \"utf8\", mode = \"w\") as fp:\n",
    "    fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读\n",
    "with open (\"html_out/_nfu_招投标.html\", encoding = \"utf8\", mode = \"r\") as fp:\n",
    "    html_load = fp.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Element html at 0x232513900e8>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# # 解析\n",
    "# parsed = requests_html.soup_parse(html_load)\n",
    "# parsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837fa4325389300f.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/ea8754261f26419080ae1933f5ae7f2a.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b972599c1c947ffe.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1096effc050fe1.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/60c660848ef44283bcae0b864f06245b.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/c1f45c4ed6d24523b8015716f1354c69.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/8de22fa69c5a4718a5d3a234157a231a.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/84df006147494c74a06300e42ba5fe0f.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/5b6a96bc894e4901b9015550f495d48c.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/cee6034ea34b4d37af132488e5b08eba.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/2b0efb94d7bc43a69cf7705d8e5eb3fb.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/4ca38f35a904483aa17f8149b6e74a5f.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/d9a43543bfc04b249605e362c4d56fde.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/a94be158ee2d45629fa34fe777c524aa.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/6c02c38297c94f82a0b00e0c465ecf42.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/f312609072284e918844133b2e8d17de.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/711839de4a50406da99b621c5c60f53a.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/ef39ea1df91b4208859c3d139614b752.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/303ff597654847ad9fe7ec28cee402b3.htm'},\n",
       " {'https://www.nfu.edu.cn/ztb/3c3b1ca74f0b47e1a6f9a6e2434014aa.htm'}]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_URL=[i.absolute_links for i in r.html.xpath('//div[@class=\"news_title\"]/a')]\n",
    "list_URL\n",
    "# 绝对路径，可直接得出链接，不需要再去拼接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ParseResult(scheme='https', netloc='www.nfu.edu.cn', path='/ztb/index.htm', params='', query='', fragment='')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 解析\n",
    "base_url = r.url\n",
    "nfu_urlparse = urllib.parse.urlparse(base_url)\n",
    "nfu_urlparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/470506662bcd43b88d44962621026474.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/b68e118513bc4b57a1b35f82341b6810.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/c80c301c0ab14bc6abc813c94906f111.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ece39ce32013478ab3d3c5ed1febd394.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/44f731db4eba4069b79c3e782f3ef9ed.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/cddb1b4019e34ed69d6106701aa4d470.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/c6e0933f03ca4e0286cf52071c1723b1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/f942001af3dd46fca215fb0a6faee799.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/4542332eb5694cb5ad71e0354d15a430.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/5552cb526a9c435d9dde349a9ac7186a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/6e1d73e9fd39479889d7fbf766097c08.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/3b2f91870cf04839a5658d12c720201b.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/6e76c23188ff4a558f96370098a5a126.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/9240c4ba726b44f5b80ac82fde99f5e1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/ff5106c54a6c4a1fae81758374f787a2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/63f0cba3d2fb4df9849df8fa9828adb8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/65b2b6b7360f44a4ad3ad98f63a50419.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/a1296c7f2684415ca0f90b7281d05a6e.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/fe33fd9663c045dbb5fc674db275e06a.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/d7badf605aa447e3b34e65f3cff4f0e7.htm']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 重组链接\n",
    "list_URL  = [urllib.parse.urlunparse\\\n",
    "([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "for detail_url in parsed.xpath('//div[@class=\"news_title\"]/a/@href')]\n",
    "list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>【广东科技报】全国大学生电子商务“三创赛”广东省选拔赛落幕 225支队伍同台竞技</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/470506662bcd43b88d4...</td>\n",
       "      <td>2018-06-15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>【南方+】奋力角逐！全国大学生电子商务挑战赛广东省选拔赛开赛</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/b68e118513bc4b57a1b...</td>\n",
       "      <td>2018-06-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>【中国教育在线】广东省高校思想政治理论课教学改革发展研讨会在中山大学南方学院召开</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/c80c301c0ab14bc6abc...</td>\n",
       "      <td>2018-06-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>【中国教育在线】225支队伍同台创意大比拼，第八届全国大学生电子商务“创新、创意及创业”挑战...</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ece39ce32013478ab3d...</td>\n",
       "      <td>2018-06-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>【中青在线】广东召开高校思想政治理论课教学改革发展研讨会</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/44f731db4eba4069b79...</td>\n",
       "      <td>2018-06-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>【广东省教育厅】“新时代、新思想、新教材”广东省高校思想政治理论课教学改革发展研讨会在中山大...</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/cddb1b4019e34ed69d6...</td>\n",
       "      <td>2018-06-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>【广东省教育厅】提高思想认识，传承优良品质，扎实推进我院党务技能提升——中山大学南方学院开展...</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/c6e0933f03ca4e0286c...</td>\n",
       "      <td>2018-05-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>【广东省教育厅】中山大学南方学院第五届教师教学竞赛决赛圆满结束</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/f942001af3dd46fca21...</td>\n",
       "      <td>2018-05-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>【新快报】广东省大中小学德育工作成果展在华工启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4542332eb5694cb5ad7...</td>\n",
       "      <td>2018-05-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>【广州参考】广东省大中小学德育工作成果展开展</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/5552cb526a9c435d9dd...</td>\n",
       "      <td>2018-05-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>【新快报】高校社团组队来参赛 “未来领读者”迎报名首波热潮</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/6e1d73e9fd39479889d...</td>\n",
       "      <td>2018-05-30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>【广东省教育厅】传承淳朴家风，树立纯正学风，弘扬优良国风——中山大学南方学院举办“传家风·树...</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/3b2f91870cf04839a56...</td>\n",
       "      <td>2018-05-22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>【广东教育】广东省“师生健康中国健康”主题健康教育活动正式启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/6e76c23188ff4a558f9...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>【搜狐网】广东省“师生健康中国健康”主题健康教育活动正式启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/9240c4ba726b44f5b80...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>【广州市从化区人民政府】广东省主题健康教育活动在我区启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ff5106c54a6c4a1fae8...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>【南方网】广东省“师生健康中国健康”主题健康教育活动正式启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/63f0cba3d2fb4df9849...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>【新快报】5 月防流感防手足口病，广东省“师生健康中国健康”主题健康教育活动启动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/65b2b6b7360f44a4ad3...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>【广州参考】省教育厅：近期各校要防流感防手足口病</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/a1296c7f2684415ca0f...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>【中国教育在线】严防严控学校传染病疫情 广东省启动“师生健康中国健康”主题健康教育活动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/fe33fd9663c045dbb5f...</td>\n",
       "      <td>2018-04-28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>【广东省教育厅】中山大学南方学院开展向卢永根同志学习系列活动</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/d7badf605aa447e3b34...</td>\n",
       "      <td>2018-04-23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   标题  \\\n",
       "0            【广东科技报】全国大学生电子商务“三创赛”广东省选拔赛落幕 225支队伍同台竞技   \n",
       "1                      【南方+】奋力角逐！全国大学生电子商务挑战赛广东省选拔赛开赛   \n",
       "2            【中国教育在线】广东省高校思想政治理论课教学改革发展研讨会在中山大学南方学院召开   \n",
       "3   【中国教育在线】225支队伍同台创意大比拼，第八届全国大学生电子商务“创新、创意及创业”挑战...   \n",
       "4                        【中青在线】广东召开高校思想政治理论课教学改革发展研讨会   \n",
       "5   【广东省教育厅】“新时代、新思想、新教材”广东省高校思想政治理论课教学改革发展研讨会在中山大...   \n",
       "6   【广东省教育厅】提高思想认识，传承优良品质，扎实推进我院党务技能提升——中山大学南方学院开展...   \n",
       "7                     【广东省教育厅】中山大学南方学院第五届教师教学竞赛决赛圆满结束   \n",
       "8                            【新快报】广东省大中小学德育工作成果展在华工启动   \n",
       "9                              【广州参考】广东省大中小学德育工作成果展开展   \n",
       "10                      【新快报】高校社团组队来参赛 “未来领读者”迎报名首波热潮   \n",
       "11  【广东省教育厅】传承淳朴家风，树立纯正学风，弘扬优良国风——中山大学南方学院举办“传家风·树...   \n",
       "12                    【广东教育】广东省“师生健康中国健康”主题健康教育活动正式启动   \n",
       "13                     【搜狐网】广东省“师生健康中国健康”主题健康教育活动正式启动   \n",
       "14                       【广州市从化区人民政府】广东省主题健康教育活动在我区启动   \n",
       "15                     【南方网】广东省“师生健康中国健康”主题健康教育活动正式启动   \n",
       "16           【新快报】5 月防流感防手足口病，广东省“师生健康中国健康”主题健康教育活动启动   \n",
       "17                           【广州参考】省教育厅：近期各校要防流感防手足口病   \n",
       "18        【中国教育在线】严防严控学校传染病疫情 广东省启动“师生健康中国健康”主题健康教育活动   \n",
       "19                     【广东省教育厅】中山大学南方学院开展向卢永根同志学习系列活动   \n",
       "\n",
       "                                                   链结          日期  \n",
       "0   https://www.nfu.edu.cn/ztb/470506662bcd43b88d4...  2018-06-15  \n",
       "1   https://www.nfu.edu.cn/ztb/b68e118513bc4b57a1b...  2018-06-12  \n",
       "2   https://www.nfu.edu.cn/ztb/c80c301c0ab14bc6abc...  2018-06-12  \n",
       "3   https://www.nfu.edu.cn/ztb/ece39ce32013478ab3d...  2018-06-12  \n",
       "4   https://www.nfu.edu.cn/ztb/44f731db4eba4069b79...  2018-06-11  \n",
       "5   https://www.nfu.edu.cn/ztb/cddb1b4019e34ed69d6...  2018-06-11  \n",
       "6   https://www.nfu.edu.cn/ztb/c6e0933f03ca4e0286c...  2018-05-31  \n",
       "7   https://www.nfu.edu.cn/ztb/f942001af3dd46fca21...  2018-05-31  \n",
       "8   https://www.nfu.edu.cn/ztb/4542332eb5694cb5ad7...  2018-05-30  \n",
       "9   https://www.nfu.edu.cn/ztb/5552cb526a9c435d9dd...  2018-05-30  \n",
       "10  https://www.nfu.edu.cn/ztb/6e1d73e9fd39479889d...  2018-05-30  \n",
       "11  https://www.nfu.edu.cn/ztb/3b2f91870cf04839a56...  2018-05-22  \n",
       "12  https://www.nfu.edu.cn/ztb/6e76c23188ff4a558f9...  2018-04-28  \n",
       "13  https://www.nfu.edu.cn/ztb/9240c4ba726b44f5b80...  2018-04-28  \n",
       "14  https://www.nfu.edu.cn/ztb/ff5106c54a6c4a1fae8...  2018-04-28  \n",
       "15  https://www.nfu.edu.cn/ztb/63f0cba3d2fb4df9849...  2018-04-28  \n",
       "16  https://www.nfu.edu.cn/ztb/65b2b6b7360f44a4ad3...  2018-04-28  \n",
       "17  https://www.nfu.edu.cn/ztb/a1296c7f2684415ca0f...  2018-04-28  \n",
       "18  https://www.nfu.edu.cn/ztb/fe33fd9663c045dbb5f...  2018-04-28  \n",
       "19  https://www.nfu.edu.cn/ztb/d7badf605aa447e3b34...  2018-04-23  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 输出结果\n",
    "# B-D-1 pd.DataFrame 建构，pandas课有教\n",
    "df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath('//div[@class=\"news_title\"]/a/@title'),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath('//font[@class=\"right-more\"]/text()'),\n",
    "     } )\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# B-D-2 pd.DataFrame 输出excel，pandas课有教\n",
    "df.to_excel(\"data_out/nfu_招投.xlsx\", sheet_name=\"检索结果\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/ztb/index.htm'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 第一页\n",
    "base_url_01 = r.url\n",
    "base_url_01"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SplitResult(scheme='https', netloc='www.nfu.edu.cn', path='/ztb/index.htm', query='', fragment='')"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlsplit(base_url_01)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/ztb/index.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              第一页\n",
       "0           https\n",
       "1  www.nfu.edu.cn\n",
       "2  /ztb/index.htm\n",
       "3                \n",
       "4                "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(urllib.parse.urlsplit(base_url_01)).rename({0:\"第一页\"},axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.nfu.edu.cn/ztb/index2.htm'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "base_url_02 = session.get('https://www.nfu.edu.cn/ztb/index2.htm').url\n",
    "base_url_02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>第一页</th>\n",
       "      <th>第二页</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https</td>\n",
       "      <td>https</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "      <td>www.nfu.edu.cn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/ztb/index.htm</td>\n",
       "      <td>/ztb/index2.htm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              第一页              第二页\n",
       "0           https            https\n",
       "1  www.nfu.edu.cn   www.nfu.edu.cn\n",
       "2  /ztb/index.htm  /ztb/index2.htm\n",
       "3                                 \n",
       "4                                 "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['第二页'] = urllib.parse.urlsplit(base_url_02)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22\n"
     ]
    }
   ],
   "source": [
    "for i in range(1,100):\n",
    "    r = session.get('https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm')\n",
    "    if r.status_code != 200:\n",
    "        print(i)\n",
    "        break\n",
    "# so page = 19?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/index1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index4.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index5.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index6.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index7.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index9.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index10.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index11.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index12.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index13.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index14.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index15.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index16.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index17.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index18.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index19.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index20.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index21.htm']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group = ['https://www.nfu.edu.cn/ztb/index'+str(i)+'.htm' for i in range(1,22)]\n",
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "url_group.insert(0,'https://www.nfu.edu.cn/ztb/index.htm')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.nfu.edu.cn/ztb/index.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index1.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index2.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index3.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index4.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index5.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index6.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index7.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index8.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index9.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index10.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index11.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index12.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index13.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index14.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index15.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index16.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index17.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index18.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index19.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index20.htm',\n",
       " 'https://www.nfu.edu.cn/ztb/index21.htm']"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url_group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/ztb/index.htm'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "urllib.parse.urlparse(url_group[0]).path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "for url in url_group:\n",
    "    r = session.get(url)\n",
    "#     print(r.html.html)\n",
    "    path = urllib.parse.urlparse(url).path\n",
    "    with open ('html_out/'+path, encoding = \"utf8\", mode = \"w\") as fp:\n",
    "        fp.write(r.html.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# xpath 准备：\n",
    "dict_xpath = {\n",
    "    '链接_xpath':'//div[@class=\"news_title\"]/a/@href',\n",
    "    '标题_xpath':'//div[@class=\"news_title\"]/a/@title',\n",
    "    '日期_xpath':'//font[@class=\"right-more\"]/text()'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pages_content_url(parsed):\n",
    "    list_URL  = [urllib.parse.urlunparse\\\n",
    "                 ([nfu_urlparse.scheme,nfu_urlparse.netloc,'/'+ nfu_urlparse.path.split('/')[1] +'/' + detail_url,'','',''])\\\n",
    "                 for detail_url in parsed.xpath(dict_xpath['链接_xpath'])]\n",
    "    return list_URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['index.htm', 'index1.htm', 'index10.htm', 'index11.htm', 'index12.htm', 'index13.htm', 'index14.htm', 'index15.htm', 'index16.htm', 'index17.htm', 'index18.htm', 'index19.htm', 'index2.htm', 'index20.htm', 'index21.htm', 'index3.htm', 'index4.htm', 'index5.htm', 'index6.htm', 'index7.htm', 'index8.htm', 'index9.htm']\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>标题</th>\n",
       "      <th>链结</th>\n",
       "      <th>日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...</td>\n",
       "      <td>2021-04-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ea8754261f26419080a...</td>\n",
       "      <td>2021-04-02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...</td>\n",
       "      <td>2021-03-31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>中山大学南方学院垃圾清运和处理服务项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...</td>\n",
       "      <td>2021-03-17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>中山大学南方学院2021年度维修、改造工程施工项目中标结果公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/60c660848ef44283bca...</td>\n",
       "      <td>2021-03-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>289</th>\n",
       "      <td>9</td>\n",
       "      <td>中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...</td>\n",
       "      <td>2015-03-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>290</th>\n",
       "      <td>10</td>\n",
       "      <td>中山大学南方学院计算机实验室设备采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>291</th>\n",
       "      <td>11</td>\n",
       "      <td>中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...</td>\n",
       "      <td>2015-03-26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>292</th>\n",
       "      <td>12</td>\n",
       "      <td>中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...</td>\n",
       "      <td>2015-03-20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>293</th>\n",
       "      <td>13</td>\n",
       "      <td>中山大学南方学院室内高尔夫模拟设备项目招标公告</td>\n",
       "      <td>https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...</td>\n",
       "      <td>2013-12-23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>434 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index                                      标题  \\\n",
       "0        0     广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目招标开标延期公告   \n",
       "1        1        广州南方学院蚊子、苍蝇、蟑螂消杀及白蚁、红火蚁防治项目 招标公告   \n",
       "2        2  中山大学南方学院数字电路基础实验室、电路与模拟电子实验室设备采购项目招标公告   \n",
       "3        3                 中山大学南方学院垃圾清运和处理服务项目招标公告   \n",
       "4        4         中山大学南方学院2021年度维修、改造工程施工项目中标结果公示   \n",
       "..     ...                                     ...   \n",
       "289      9          中山大学南方学院学生体质健康测试仪采购项目招标公告（第二次）   \n",
       "290     10                中山大学南方学院计算机实验室设备采购项目中标公示   \n",
       "291     11      中山大学南方学院电气工程及自动化实验室设备采购项目招标公告（第二次）   \n",
       "292     12               中山大学南方学院音乐楼阶梯课室座椅采购项目中标公示   \n",
       "293     13                 中山大学南方学院室内高尔夫模拟设备项目招标公告   \n",
       "\n",
       "                                                    链结          日期  \n",
       "0    https://www.nfu.edu.cn/ztb/4aa14103a6d34d42837...  2021-04-08  \n",
       "1    https://www.nfu.edu.cn/ztb/ea8754261f26419080a...  2021-04-02  \n",
       "2    https://www.nfu.edu.cn/ztb/7226fe9acf3b4757b97...  2021-03-31  \n",
       "3    https://www.nfu.edu.cn/ztb/414b2db5e6c04f99be1...  2021-03-17  \n",
       "4    https://www.nfu.edu.cn/ztb/60c660848ef44283bca...  2021-03-11  \n",
       "..                                                 ...         ...  \n",
       "289  https://www.nfu.edu.cn/ztb/ba01c43761e245d4937...  2015-03-27  \n",
       "290  https://www.nfu.edu.cn/ztb/0020f85b9ef24d0792d...  2015-03-26  \n",
       "291  https://www.nfu.edu.cn/ztb/10482a669fc54447aa2...  2015-03-26  \n",
       "292  https://www.nfu.edu.cn/ztb/4e5e67a17b7d47cf8cc...  2015-03-20  \n",
       "293  https://www.nfu.edu.cn/ztb/35a1b4dab36a4ae5aa4...  2013-12-23  \n",
       "\n",
       "[434 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "list_df = []\n",
    "\n",
    "\n",
    "files= os.listdir('html_out/ztb/')\n",
    "print(files)\n",
    "\n",
    "for html in files:\n",
    "    with open('html_out/ztb/'+html,encoding='utf8',mode='r') as fp:\n",
    "        html_load = fp.read()\n",
    "        parsed = requests_html.soup_parse(html_load)\n",
    "        list_URL = pages_content_url(parsed)\n",
    "        \n",
    "        df = pd.DataFrame( {\n",
    "         \"标题\": parsed.xpath(dict_xpath['标题_xpath']),\n",
    "         \"链结\": list_URL,\n",
    "         \"日期\": parsed.xpath(dict_xpath['日期_xpath']),\n",
    "        } )\n",
    "        list_df.append(df)\n",
    "\n",
    "        \n",
    "        \n",
    "df_all = pd.concat(list_df).reset_index().sort_values(by='日期',ascending=False)\n",
    "display(df_all)    \n",
    "\n",
    "with pd.ExcelWriter('data_out/nfu招投标.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_all.to_excel(writer, sheet_name='媒体报道')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
